Data for this analysis has been sourced from Inside Airbnb for London, UK on 06 September, 2023.
# Load data
import pandas as pd

# low_memory=False reads the whole file in one pass so mixed-type columns
# (e.g. column 68) are inferred correctly — silences the DtypeWarning.
df = pd.read_csv(
    r"C:\Users\Dell\OneDrive - Trinity College Dublin\Desktop\listings.csv",
    low_memory=False,
)
df.head()
import sweetviz as sv
# IPython.core.display is deprecated since IPython 7.14; import from IPython.display
from IPython.display import display, HTML

# Analyze the dataframe
analyze_report = sv.analyze(df)

# show_html() writes the report to disk and returns None, so it cannot be passed
# to HTML() directly — generate the file first, then render its contents inline.
analyze_report.show_html('report.html', open_browser=False)
with open('report.html', encoding='utf-8') as report_file:
    display(HTML(report_file.read()))
C:\Users\Dell\AppData\Local\Temp\ipykernel_112\4252467769.py:3: DtypeWarning: Columns (68) have mixed types. Specify dtype option on import or set low_memory=False. df = pd.read_csv(r"C:\Users\Dell\OneDrive - Trinity College Dublin\Desktop\listings.csv") C:\Users\Dell\AppData\Local\Temp\ipykernel_112\4252467769.py:6: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display from IPython.core.display import display, HTML
| | [ 0%] 00:00 ->…
C:\Users\Dell\anaconda3\Lib\site-packages\sweetviz\series_analyzer.py:17: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning. value_counts_without_nan = pd.Series() C:\Users\Dell\anaconda3\Lib\site-packages\sweetviz\series_analyzer.py:17: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning. value_counts_without_nan = pd.Series() C:\Users\Dell\anaconda3\Lib\site-packages\sweetviz\series_analyzer.py:17: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning. value_counts_without_nan = pd.Series()
Report report.html was generated.
<IPython.core.display.HTML object>
# Render the sweetviz report inline in the notebook (alternative to the HTML file)
analyze_report.show_notebook()
# Strip the currency symbol and thousands separators, then convert to float.
# regex=False makes the literal replacements explicit and silences the
# pandas FutureWarning about the changing default of `regex`.
df['price'] = (
    df['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)
C:\Users\Dell\AppData\Local\Temp\ipykernel_112\1531488561.py:1: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
df['price'] = df['price'].str.replace('$', '').str.replace(',', '').astype(float)
# Calculate average, minimum, maximum, median, and 3rd quartile prices for all of London
average_price = df['price'].mean()
min_price = df['price'].min()
max_price = df['price'].max()
median_price = df['price'].median()
third_quartile_price = df['price'].quantile(0.75)

# Print each statistic formatted to two decimal places.
summary = [
    ("Average Price", average_price),
    ("Minimum Price", min_price),
    ("Maximum Price", max_price),
    ("Median Price", median_price),
    ("3rd Quartile Price", third_quartile_price),
]
print("Overall in London:")
for label, value in summary:
    print(f"{label}: ${value:.2f}")
Overall in London: Average Price: $181.35 Minimum Price: $0.00 Maximum Price: $80100.00 Median Price: $110.00 3rd Quartile Price: $193.00
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Remove price outliers with the 1.5*IQR fence before plotting the distribution.
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df_no_outliers = df[(df['price'] >= lower_bound) & (df['price'] <= upper_bound)]

# Set style
sns.set_style("whitegrid")

# One row of three panels: histogram, boxplot, KDE.
fig, axes = plt.subplots(1, 3, figsize=(20, 5))

# Histogram
sns.histplot(df_no_outliers['price'], ax=axes[0], bins=30, kde=False, color='skyblue')
axes[0].set_title('Histogram of Prices', fontsize=17)
axes[0].set_xlabel('Price', fontsize=16)
axes[0].set_ylabel('Frequency', fontsize=16)
axes[0].tick_params(axis='both', labelsize=14)

# Boxplot
sns.boxplot(df_no_outliers['price'], ax=axes[1], color='lightgreen')
axes[1].set_title('Boxplot of Prices', fontsize=17)
axes[1].set_xlabel('Price', fontsize=16)
axes[1].tick_params(axis='both', labelsize=14)

# KDE — `shade` is deprecated (removed in seaborn 0.14); `fill=True` is the
# documented replacement and produces the same shaded curve.
sns.kdeplot(df_no_outliers['price'], ax=axes[2], fill=True, color='salmon')
axes[2].set_title('Kernel Density Estimation of Prices', fontsize=17)
axes[2].set_xlabel('Price', fontsize=16)
axes[2].set_ylabel('Density', fontsize=16)
axes[2].tick_params(axis='both', labelsize=14)

plt.tight_layout()
plt.show()
C:\Users\Dell\AppData\Local\Temp\ipykernel_112\538630741.py:40: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code. sns.kdeplot(df_no_outliers['price'], ax=axes[2], shade=True, color='salmon')
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Average reviews and average price per neighbourhood, most-reviewed first.
grouped_data = df.groupby('neighbourhood_cleansed').agg({
    'number_of_reviews': 'mean',
    'price': 'mean'
}).sort_values(by='number_of_reviews', ascending=False)

# Normalize the average price values to create a colormap
norm = plt.Normalize(grouped_data['price'].min(), grouped_data['price'].max())
cmap = plt.get_cmap("YlOrRd")

# Color mapping (this was previously computed twice back-to-back; once suffices)
bar_colors = cmap(norm(grouped_data['price'].values))

# Visualization for Neighbourhood Popularity with bars colored by average price
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(grouped_data.index, grouped_data['number_of_reviews'], color=bar_colors, width=0.6)  # thinner bars
ax.set_title("Neighbourhoods by Average Number of Reviews")
ax.set_ylabel("Average Number of Reviews")
ax.set_xlabel("Neighbourhood")
plt.xticks(rotation=45, ha='right')

# Add a colorbar to the right
cbar = fig.colorbar(plt.cm.ScalarMappable(norm=norm, cmap=cmap), ax=ax, orientation='vertical', label='Average Price ($)')
plt.tight_layout()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Group by room_type and calculate both average price and average number of reviews.
# (This grouping was previously duplicated verbatim in two consecutive cells.)
grouped_data = df.groupby('room_type').agg({
    'price': 'mean',
    'number_of_reviews': 'mean'
}).reset_index()

# Visualization
plt.figure(figsize=(7, 5))

# Using seaborn's barplot for the grouped data; hue='price' colours bars by price.
ax = sns.barplot(data=grouped_data, x='room_type', y='number_of_reviews', palette='viridis', hue='price', dodge=False)

# Formatting legend labels (the hue values, i.e. average prices) to two decimals.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, [f"${float(price):.2f}" for price in labels], title='Average Price ($)')

plt.title("Popularity and Average Price based on Room Type")
plt.ylabel("Average Number of Reviews")
plt.xlabel("Room Type")
plt.tight_layout()
plt.show()
Hotel rooms tend to be more expensive than other room types, at an average price of $256, whereas private rooms are almost 60% cheaper than a hotel room; however, hotel rooms appear to be more popular than private rooms based on the number of reviews.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Mean number of available days within each forward-looking booking window.
window_cols = ['availability_30', 'availability_60', 'availability_90', 'availability_365']
avg_availabilities = df[window_cols].mean()

# Two panels: availability flag counts on the left, window averages on the right.
fig, (ax_count, ax_avg) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: how many listings report availability at all.
sns.countplot(data=df, x='has_availability', palette='viridis', ax=ax_count)
ax_count.set_title('Count of Listings by Availability Status', fontsize=15)
ax_count.set_ylabel('Number of Listings', fontsize=14)
ax_count.set_xlabel('Has Availability', fontsize=14)
ax_count.tick_params(axis='both', labelsize=14)

# Right panel: average days available over each duration.
avg_availabilities.plot(kind='bar', color='c', ax=ax_avg)
ax_avg.set_title('Average Availability Over Different Durations', fontsize=15)
ax_avg.set_ylabel('Average Days Available', fontsize=14)
ax_avg.set_xlabel('Availability Duration', fontsize=14)
ax_avg.tick_params(axis='x', rotation=0)  # keep x-axis labels horizontal
ax_avg.tick_params(axis='both', labelsize=14)
ax_avg.set_xticklabels(ax_avg.get_xticklabels(), rotation=0, fontsize=10)

plt.tight_layout()
plt.show()

# Printing availability
print(avg_availabilities)
availability_30 7.821663 availability_60 18.679974 availability_90 31.366680 availability_365 121.112808 dtype: float64
We can see that, on average, the listings are available only one third of any given time window, and more than 90% of the listings were available when the data was collected (scraped).
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# If the listing is instantly bookable, treat a missing host_response_time as
# 'within an hour'. Note: pandas read_csv converts the literal string "N/A" to
# NaN by default (it is in the default na_values list), so the original
# `== 'N/A'` comparison never matched — match NaN as well as the raw string.
missing_response = df['host_response_time'].isna() | (df['host_response_time'] == 'N/A')
df.loc[(df['instant_bookable'] == 't') & missing_response, 'host_response_time'] = 'within an hour'

# Grouping by host_response_time and calculating the average number_of_reviews
grouped_data = df.groupby('host_response_time')['number_of_reviews'].mean().reset_index()

# Visualization
plt.figure(figsize=(10, 6))
sns.barplot(data=grouped_data, x='host_response_time', y='number_of_reviews', palette='viridis')
plt.title('Impact of Host Response Time on Bookings')
plt.xlabel('Host Response Time')
plt.ylabel('Average Number of Reviews')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
print(grouped_data)
host_response_time number_of_reviews 0 a few days or more 6.066195 1 within a day 14.098164 2 within a few hours 20.001935 3 within an hour 25.264985
The average number of reviews is highest — around 25 — where the host response time is less than an hour. The number of reviews decreases steadily as response time increases.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Mean review count and mean rating, split by whether the host is identity-verified.
grouped_data = (
    df.groupby('host_identity_verified')
      .agg({'number_of_reviews': 'mean', 'review_scores_rating': 'mean'})
      .reset_index()
)

# Two side-by-side bar charts: review volume and review score.
fig, (left_ax, right_ax) = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))

# Panel 1: average number of reviews per verification status.
sns.barplot(data=grouped_data, x='host_identity_verified', y='number_of_reviews', palette='viridis', ax=left_ax)
left_ax.set_title('Impact of Verification on Average Number of Reviews', fontsize=15)
left_ax.set_xlabel('Host Identity Verified', fontsize=14)
left_ax.set_ylabel('Average Number of Reviews', fontsize=14)
left_ax.tick_params(axis='both', labelsize=14)

# Panel 2: average review score rating per verification status.
sns.barplot(data=grouped_data, x='host_identity_verified', y='review_scores_rating', palette='viridis', ax=right_ax)
right_ax.set_title('Impact of Verification on Average Review Score Rating', fontsize=15)
right_ax.set_xlabel('Host Identity Verified', fontsize=14)
right_ax.set_ylabel('Average Review Score Rating', fontsize=14)
right_ax.tick_params(axis='both', labelsize=14)

plt.tight_layout()
plt.show()
print(grouped_data)
host_identity_verified number_of_reviews review_scores_rating 0 f 6.871379 4.370692 1 t 19.590804 4.621971
The average number of reviews is around 20 when the host is verified, and the average review score of a listing also tends to be higher when the host is verified.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# `amenities` is a stringified list: drop the surrounding brackets and the
# quote marks, then split on ", " to get one amenity per element.
cleaned = df['amenities'].str.strip('[]').str.replace('"', '')
amenities_count = cleaned.str.split(', ').explode().value_counts()

# Plot the ten most frequent amenities.
top_amenities = amenities_count.head(10)
plt.figure(figsize=(12, 6))
sns.barplot(y=top_amenities.index, x=top_amenities.values, palette='viridis')
plt.title('Top 10 Most Frequent Amenities')
plt.xlabel('Frequency')
plt.ylabel('Amenity')
plt.show()

print(amenities_count)
Kitchen 80119
Wifi 79856
Smoke alarm 78620
Essentials 73523
Iron 62614
...
Four oven 1
UE Boom Bluetooth sound system 1
Smeg stainless steel gas stove 1
Electric hob electric stove 1
Paul Mitchell shampoo 1
Name: amenities, Length: 7291, dtype: int64
Kitchen is the most commonly listed amenity, appearing 80,119 times out of 87,946 listings, followed by Wifi as the next most common amenity.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Strip currency formatting and convert price to float. The pattern is a raw
# string: '[\$,]' in a normal string contains the invalid escape '\$', which
# raises a SyntaxWarning on Python 3.12+. Series.replace is a no-op on rows
# already converted to float earlier in the notebook.
df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

# Number of amenities per listing (`amenities` holds a stringified list,
# so counting comma-separated pieces approximates the amenity count).
df['amenities_count'] = df['amenities'].str.split(',').str.len()
def filter_outliers(data, column_name):
    """Return *data* without rows lying outside the 1.5*IQR fences of *column_name*.

    Rows with NaN in the column are kept (NaN compares False on both fences).
    """
    q1 = data[column_name].quantile(0.25)
    q3 = data[column_name].quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    # Negate "is an outlier" so NaN rows survive, matching pandas comparison semantics.
    is_outlier = (data[column_name] < lower_fence) | (data[column_name] > upper_fence)
    return data[~is_outlier]
# Remove outliers in both 'price' and 'amenities_count' before measuring association.
df_filtered = filter_outliers(filter_outliers(df, 'price'), 'amenities_count')

# Pearson correlation between price and the number of amenities.
correlation = df_filtered[['price', 'amenities_count']].corr().iloc[0, 1]
print(f"Correlation between price and number of amenities: {correlation}")

# Scatter plot with a fitted regression line.
plt.figure(figsize=(10, 6))
sns.regplot(
    x=df_filtered['amenities_count'],
    y=df_filtered['price'],
    scatter_kws={'s': 10, 'alpha': 0.5},
    line_kws={'color': 'red'},
)
plt.title('Scatter Plot with Regression Line of Number of Amenities vs Price (Outliers Removed)')
plt.xlabel('Number of Amenities')
plt.ylabel('Price')
plt.show()
Correlation between price and number of amenities: 0.29270030492598725
Location Value: Properties closer to the city center or specific high-demand areas may be priced higher. These areas might offer better amenities, connectivity, or prestige.
Pricing Strategy: Properties in far-off neighborhoods are relatively cheaper. This can be due to factors such as distance from the city center, fewer amenities, or less developed infrastructure.
Potential Areas of Interest: Identify regions where property prices are surprisingly low or high. This can indicate undervalued or overvalued properties, or the influence of other external factors not captured in the dataset.
import folium
import branca

# Keep only listings priced at or below the 9th decile to avoid extreme colours.
price_threshold = df['price'].quantile(0.9)
df2_filtered = df[df['price'] <= price_threshold]

# Base map centred on London.
m = folium.Map(location=[51.5074, -0.1278], zoom_start=10, tiles='cartodb positron')

# Anchor the diverging colour scale at the minimum, median and maximum price.
min_price = df2_filtered['price'].min()
midpoint = df2_filtered['price'].median()
max_price = df2_filtered['price'].max()
colormap = branca.colormap.LinearColormap(
    colors=['blue', 'yellow', 'red'],
    index=[min_price, midpoint, max_price],
    vmin=min_price,
    vmax=max_price,
)

# One small semi-transparent dot per listing, coloured by its price.
for _, listing in df2_filtered.iterrows():
    folium.CircleMarker(
        location=(listing['latitude'], listing['longitude']),
        radius=0.5,
        color=colormap(listing['price']),
        fill=True,
        fill_opacity=0.4,
    ).add_to(m)

# Attach the colour scale as a legend and render the map.
colormap.caption = 'Property Prices'
colormap.add_to(m)
m